Gender prediction on USA baby names

Download USA SSN data for top 100 baby names by gender per state from: https://www.ssa.gov/oact/babynames/limits.html

Specifically download the file, which has all the names data: https://www.ssa.gov/oact/babynames/state/namesbystate.zip

  mkdir data
  wget https://www.ssa.gov/oact/babynames/state/namesbystate.zip
  mkdir SSN_data
  cd SSN_data
  unzip ../namesbystate.zip
  cd ..
  cat SSN_data/*.TXT > data/SSN_names_state.txt

Download average state latitude and longitude from: http://dev.maxmind.com/geoip/legacy/codes/state_latlon/
File name: https://dev.maxmind.com/static/csv/codes/state_latlon.csv



In [1]:

    
%matplotlib inline

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn import cross_validation
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, LabelBinarizer

from sklearn.linear_model import LogisticRegression
from sklearn.cluster.hierarchical import AgglomerativeClustering

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("ticks")
sns.set_context("poster")



In [2]:

    
df = pd.read_csv("data/SSN_names_state.txt", header=None)



In [3]:

    
df.head()



In [4]:

    
df.columns = ["State", "Gender", "Year", "Name", "Count"]



In [5]:

    
df.head()



In [6]:

    
df.shape









    Out[6]:





(5647426, 5)



In [7]:

    
df["Gender"].value_counts() # Instances per gender









    Out[7]:





F    3154009
M    2493417
dtype: int64



In [8]:

    
# Calculate number of names of each gender per year
df_t = df.pivot_table(values="Count", index="Year", columns="Gender", aggfunc=np.sum)
df_t.sum(axis=0)









    Out[8]:





Gender
F    143770075
M    155113251
dtype: int64



In [9]:

    
df_a = df_t.div(df_t.sum(axis=1), axis=0)*100
plt.plot(df_a.index, df_a["F"], label="Female", marker="o")
plt.plot(df_a.index, df_a["M"], label="Male", marker="s")
plt.xlabel("Year")
plt.ylabel("Percentage of names")
plt.legend(frameon=True, fancybox=True)









    Out[9]:





<matplotlib.legend.Legend at 0x7f93bc1235d0>



In [10]:

    
# Get top gender names per state
df.sort(["Count"], ascending=False).groupby(["State", "Gender"]).head(1)\
.pivot_table(values=["Name", "Count", "Year"], columns="Gender", index="State", aggfunc=lambda x: x.iloc[0])\
.swaplevel(0, 1, axis=1).sort_index(axis=1)









    Out[10]:






  
    
      Gender
      F
      M
    
    
      
      Count
      Name
      Year
      Count
      Name
      Year
    
    
      State
      
      
      
      
      
      
    
  
  
    
      AK
      164
      Jessica
      1985
      207
      Michael
      1982
    
    
      AL
      2694
      Mary
      1925
      3038
      James
      1947
    
    
      AR
      1424
      Linda
      1948
      1620
      James
      1947
    
    
      AZ
      875
      Jessica
      1987
      1045
      Michael
      1984
    
    
      CA
      6949
      Jessica
      1991
      8268
      Michael
      1956
    
    
      CO
      1036
      Linda
      1948
      973
      Michael
      1970
    
    
      CT
      1155
      Linda
      1947
      1622
      Robert
      1947
    
    
      DC
      615
      Linda
      1949
      884
      John
      1947
    
    
      DE
      207
      Jennifer
      1974
      311
      Michael
      1958
    
    
      FL
      2739
      Jessica
      1987
      3491
      Michael
      1990
    
    
      GA
      2614
      Mary
      1922
      3066
      James
      1947
    
    
      HI
      243
      Jennifer
      1981
      325
      Michael
      1957
    
    
      IA
      2365
      Linda
      1947
      1667
      Robert
      1924
    
    
      ID
      530
      Linda
      1947
      348
      Michael
      1954
    
    
      IL
      5285
      Linda
      1947
      6220
      Michael
      1957
    
    
      IN
      3012
      Linda
      1947
      2781
      Michael
      1954
    
    
      KS
      1387
      Linda
      1947
      1184
      Michael
      1954
    
    
      KY
      2537
      Mary
      1924
      2553
      James
      1947
    
    
      LA
      1648
      Linda
      1947
      1667
      Michael
      1958
    
    
      MA
      3035
      Mary
      1921
      3895
      Robert
      1947
    
    
      MD
      1505
      Linda
      1947
      1789
      Michael
      1957
    
    
      ME
      778
      Linda
      1947
      649
      Robert
      1947
    
    
      MI
      4538
      Linda
      1947
      4924
      Michael
      1957
    
    
      MN
      2144
      Linda
      1948
      2043
      David
      1955
    
    
      MO
      2797
      Linda
      1947
      2430
      James
      1947
    
    
      MS
      1840
      Mary
      1927
      2278
      James
      1947
    
    
      MT
      472
      Linda
      1948
      403
      John
      1947
    
    
      NC
      3126
      Mary
      1924
      3884
      James
      1947
    
    
      ND
      496
      Linda
      1947
      344
      David
      1952
    
    
      NE
      1055
      Linda
      1947
      937
      Robert
      1924
    
    
      NH
      437
      Linda
      1947
      447
      Robert
      1947
    
    
      NJ
      2398
      Jennifer
      1972
      3699
      John
      1964
    
    
      NM
      589
      Mary
      1947
      543
      David
      1961
    
    
      NV
      268
      Jessica
      1988
      341
      Michael
      1990
    
    
      NY
      7542
      Linda
      1947
      10023
      Robert
      1947
    
    
      OH
      5881
      Linda
      1947
      5492
      David
      1955
    
    
      OK
      2064
      Linda
      1947
      1273
      James
      1925
    
    
      OR
      1121
      Linda
      1947
      984
      Michael
      1953
    
    
      PA
      8184
      Mary
      1918
      8048
      John
      1947
    
    
      RI
      485
      Mary
      1921
      682
      Robert
      1947
    
    
      SC
      1742
      Mary
      1924
      2314
      James
      1947
    
    
      SD
      507
      Linda
      1948
      413
      Robert
      1925
    
    
      TN
      2785
      Linda
      1947
      3279
      James
      1947
    
    
      TX
      5060
      Linda
      1947
      4983
      Christopher
      1985
    
    
      UT
      702
      Jennifer
      1972
      657
      Michael
      1980
    
    
      VA
      2307
      Linda
      1948
      2676
      James
      1947
    
    
      VT
      325
      Linda
      1947
      262
      David
      1955
    
    
      WA
      1751
      Linda
      1948
      1620
      Michael
      1954
    
    
      WI
      2399
      Mary
      1954
      2400
      David
      1955
    
    
      WV
      1795
      Linda
      1947
      1682
      James
      1947
    
    
      WY
      229
      Linda
      1948
      194
      David
      1955



In [11]:

    
# Get top gender names per state
# Same as above but more efficient syntax
df.sort(["Count"], ascending=False).groupby(["State", "Gender"]).head(1)\
.pivot(index="State", columns="Gender").swaplevel(0, 1, axis=1).sort_index(axis=1)









    Out[11]:






  
    
      Gender
      F
      M
    
    
      
      Count
      Name
      Year
      Count
      Name
      Year
    
    
      State
      
      
      
      
      
      
    
  
  
    
      AK
      164
      Jessica
      1985
      207
      Michael
      1982
    
    
      AL
      2694
      Mary
      1925
      3038
      James
      1947
    
    
      AR
      1424
      Linda
      1948
      1620
      James
      1947
    
    
      AZ
      875
      Jessica
      1987
      1045
      Michael
      1984
    
    
      CA
      6949
      Jessica
      1991
      8268
      Michael
      1956
    
    
      CO
      1036
      Linda
      1948
      973
      Michael
      1970
    
    
      CT
      1155
      Linda
      1947
      1622
      Robert
      1947
    
    
      DC
      615
      Linda
      1949
      884
      John
      1947
    
    
      DE
      207
      Jennifer
      1974
      311
      Michael
      1958
    
    
      FL
      2739
      Jessica
      1987
      3491
      Michael
      1990
    
    
      GA
      2614
      Mary
      1922
      3066
      James
      1947
    
    
      HI
      243
      Jennifer
      1981
      325
      Michael
      1957
    
    
      IA
      2365
      Linda
      1947
      1667
      Robert
      1924
    
    
      ID
      530
      Linda
      1947
      348
      Michael
      1954
    
    
      IL
      5285
      Linda
      1947
      6220
      Michael
      1957
    
    
      IN
      3012
      Linda
      1947
      2781
      Michael
      1954
    
    
      KS
      1387
      Linda
      1947
      1184
      Michael
      1954
    
    
      KY
      2537
      Mary
      1924
      2553
      James
      1947
    
    
      LA
      1648
      Linda
      1947
      1667
      Michael
      1958
    
    
      MA
      3035
      Mary
      1921
      3895
      Robert
      1947
    
    
      MD
      1505
      Linda
      1947
      1789
      Michael
      1957
    
    
      ME
      778
      Linda
      1947
      649
      Robert
      1947
    
    
      MI
      4538
      Linda
      1947
      4924
      Michael
      1957
    
    
      MN
      2144
      Linda
      1948
      2043
      David
      1955
    
    
      MO
      2797
      Linda
      1947
      2430
      James
      1947
    
    
      MS
      1840
      Mary
      1927
      2278
      James
      1947
    
    
      MT
      472
      Linda
      1948
      403
      John
      1947
    
    
      NC
      3126
      Mary
      1924
      3884
      James
      1947
    
    
      ND
      496
      Linda
      1947
      344
      David
      1952
    
    
      NE
      1055
      Linda
      1947
      937
      Robert
      1924
    
    
      NH
      437
      Linda
      1947
      447
      Robert
      1947
    
    
      NJ
      2398
      Jennifer
      1972
      3699
      John
      1964
    
    
      NM
      589
      Mary
      1947
      543
      David
      1961
    
    
      NV
      268
      Jessica
      1988
      341
      Michael
      1990
    
    
      NY
      7542
      Linda
      1947
      10023
      Robert
      1947
    
    
      OH
      5881
      Linda
      1947
      5492
      David
      1955
    
    
      OK
      2064
      Linda
      1947
      1273
      James
      1925
    
    
      OR
      1121
      Linda
      1947
      984
      Michael
      1953
    
    
      PA
      8184
      Mary
      1918
      8048
      John
      1947
    
    
      RI
      485
      Mary
      1921
      682
      Robert
      1947
    
    
      SC
      1742
      Mary
      1924
      2314
      James
      1947
    
    
      SD
      507
      Linda
      1948
      413
      Robert
      1925
    
    
      TN
      2785
      Linda
      1947
      3279
      James
      1947
    
    
      TX
      5060
      Linda
      1947
      4983
      Christopher
      1985
    
    
      UT
      702
      Jennifer
      1972
      657
      Michael
      1980
    
    
      VA
      2307
      Linda
      1948
      2676
      James
      1947
    
    
      VT
      325
      Linda
      1947
      262
      David
      1955
    
    
      WA
      1751
      Linda
      1948
      1620
      Michael
      1954
    
    
      WI
      2399
      Mary
      1954
      2400
      David
      1955
    
    
      WV
      1795
      Linda
      1947
      1682
      James
      1947
    
    
      WY
      229
      Linda
      1948
      194
      David
      1955



In [12]:

    
# Do for full data:
df_props = df.pivot_table(index=["Name", "State", "Year"], columns="Gender")["Count"].reset_index().fillna(0)



In [13]:

    
df_props.head()



In [14]:

    
df_props.columns









    Out[14]:





Index([u'Name', u'State', u'Year', u'F', u'M'], dtype='object', name=u'Gender')



In [15]:

    
df_props.shape, df.shape









    Out[15]:





((5448518, 5), (5647426, 5))



In [16]:

    
# Get top 20 names which have occured in the maximum number of years
df.groupby("Name")["Year"].apply(lambda x: len(x.unique())).sort(ascending=False, inplace=False).head(20)









    Out[16]:





Name
Antoinette    105
Nellie        105
Dolores       105
Ivory         105
Ivy           105
Dixie         105
Nina          105
Amos          105
Jack          105
Jackie        105
Jackson       105
Nick          105
Jacob         105
Nicholas      105
Jacqueline    105
Nelson        105
Neil          105
Grant         105
Neal          105
Nathaniel     105
Name: Year, dtype: int64



In [17]:

    
df_props["Total"] = df_props[["F", "M"]].sum(axis=1)
df_props["MaxCount"] = df_props[["F", "M"]].max(axis=1)
df_props.head()



In [31]:

    
# Write names per gender to file
df_props.to_csv("data/ssn_namesbystate_counts.txt", sep="\t", index=False)



In [18]:

    
(df_props[["F", "M"]].div(df_props["Total"], axis=0).max(axis=1) > 0.9).mean()









    Out[18]:





0.97653949202333556



In [19]:

    
df_props[["F", "M"]].div(df_props["Total"], axis=0).idxmax(axis=1).head()









    Out[19]:





0    M
1    M
2    M
3    M
4    M
dtype: object



In [20]:

    
df_props["BestGender"] = df_props[["F", "M"]].div(df_props["Total"], axis=0).idxmax(axis=1)



In [21]:

    
df_props.head()



In [22]:

    
df["Name"].unique().shape









    Out[22]:





(30274,)



In [23]:

    
df_props[df_props["Total"] > 100].shape









    Out[23]:





(575161, 8)



In [24]:

    
df[(df["Name"] == "Aaban") & (df["State"] == "NY") & (df["Year"] == 2013)]









    Out[24]:






  
    
      
      State
      Gender
      Year
      Name
      Count
    
  
  
    
      3865866
      NY
      M
      2013
      Aaban
      6



In [25]:

    
df_props.describe()









    Out[25]:






  
    
      
      Year
      F
      M
      Total
      MaxCount
    
  
  
    
      count
      5448518.000000
      5448518.000000
      5448518.000000
      5448518.000000
      5448518.000000
    
    
      mean
      1972.316033
      26.387006
      28.468888
      54.855894
      54.256168
    
    
      std
      29.617741
      109.590313
      152.552096
      185.000905
      183.863425
    
    
      min
      1910.000000
      0.000000
      0.000000
      5.000000
      5.000000
    
    
      25%
      1949.000000
      0.000000
      0.000000
      7.000000
      7.000000
    
    
      50%
      1977.000000
      6.000000
      0.000000
      13.000000
      13.000000
    
    
      75%
      1999.000000
      15.000000
      12.000000
      35.000000
      35.000000
    
    
      max
      2014.000000
      8184.000000
      10023.000000
      10046.000000
      10023.000000



In [26]:

    
df_props[(df_props["Total"] > 500) & ((df_props["Total"]/df_props["MaxCount"]) == 1)].shape









    Out[26]:





(59777, 8)



In [27]:

    
# Filter names to only include names which occur in 500 times per year and are 100% of a given gender
df_props_filtered = df_props[(df_props["Total"] > 500) & ((df_props["Total"]/df_props["MaxCount"]) == 1)].copy()
df_props_filtered.to_csv("data/ssn_namesbystate_counts.filtered.txt", sep="\t", index=False)



In [28]:

    
# Create one dataset for getting only names and genders

df_names = df_props.groupby("Name")[["F", "M"]].sum().reset_index()
df_names.head()



In [29]:

    
df_names["Total"] = df_names[["F", "M"]].sum(axis=1)
df_names["MaxCount"] = df_names[["F", "M"]].max(axis=1)
df_names["BestGender"] = df_names[["F", "M"]].div(df_names["Total"], axis=0).idxmax(axis=1)
df_names.head()



In [93]:

    
# First add start and end characters to identify beginning and end of chars
df_props_filtered["Name_proc"] = df_props_filtered.Name.apply(lambda x: ("^%s$" % x.strip()))
df_props_filtered[["Name", "Year","State","Name_proc", "F", "M", "MaxCount","Total","BestGender"]].to_csv("data/ssn_namesbystate_counts.filtered.txt", sep="\t", index=False)


df_names["Name_proc"] = df_names.Name.apply(lambda x: "^%s$" % x.strip())
#df_names.head()
df_names[["Name","Name_proc", "F", "M", "MaxCount","Total","BestGender"]].to_csv("data/ssn_names.txt", sep="\t", index=False)

# Get latitude and longitudes for each state
df_states_latlong = pd.read_csv("data/state_latlon.csv")

df_states_latlong.columns = ["State", "Latitude", "Longitude"]

df_names_latlong = pd.merge(df_props_filtered, df_states_latlong, how="left", on="State")
#pd.merge(df, df_states_latlong, how="left", on="State").to_csv("data/SSN_names_state.latlong.txt", index=False, sep="\t")



In [95]:

    
df_props_year = df_props.groupby(["Name", "Year"]).sum().reset_index()



In [96]:

    
print df_props_year.shape
df_props_year.head()









    



(548154, 6)






    Out[96]:






  
    
      Gender
      Name
      Year
      F
      M
      Total
      MaxCount
    
  
  
    
      0
      Aaban
      2013
      0
      6
      6
      6
    
    
      1
      Aaban
      2014
      0
      6
      6
      6
    
    
      2
      Aadan
      2008
      0
      12
      12
      12
    
    
      3
      Aadan
      2009
      0
      6
      6
      6
    
    
      4
      Aadan
      2014
      0
      5
      5
      5



In [99]:

    
df_props_year["BestGender"] = df_props_year[["F", "M"]].div(df_props_year["Total"], axis=0).idxmax(axis=1)
df_props_year.head()



In [104]:

    
gg = df_props_year[df_props_year.Total > 1000].groupby(["Year", "BestGender"])



In [109]:

    
#gg.filter(lambda x: x.Total > (np.random.rand(x.Total.shape[0])*x.Total.max())).head()

Training the model on the filtered data

We will first use a simple logisitic regression model char features



In [34]:

    
cvec = CountVectorizer(analyzer="char", ngram_range=(2, 4), binary=True, lowercase=False, min_df=10)
cvec.fit(df_names.Name_proc.values)









    Out[34]:





CountVectorizer(analyzer='char', binary=True, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=False, max_df=1.0, max_features=None, min_df=10,
        ngram_range=(2, 4), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)



In [35]:

    
cvec.transform(df_names.Name_proc.values[:10])









    Out[35]:





<10x6895 sparse matrix of type '<type 'numpy.int64'>'
	with 133 stored elements in Compressed Sparse Row format>



In [36]:

    
model = LogisticRegression(penalty="l1")

np.random.seed(1337)
shuffle_ids = np.random.permutation(df_names.shape[0])
split_point = int(len(shuffle_ids)*0.6)
train_ids, test_ids = shuffle_ids[:split_point], shuffle_ids[split_point:]
train_ids.shape, test_ids.shape

X_train, y_train = cvec.transform(df_names.iloc[train_ids].Name_proc.values), df_names.iloc[train_ids].BestGender
X_test, y_test = cvec.transform(df_names.iloc[test_ids].Name_proc.values), df_names.iloc[test_ids].BestGender

#X_train, X_test, y_train, y_test = cross_validation.train_test_split(cvec.transform(df_names.Name_proc.values),df_names.BestGender, test_size=0.4, random_state=100)

model.fit(X_train, y_train)









    Out[36]:





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)



In [37]:

    
X_train.shape, y_train.shape, X_test.shape, y_test.shape









    Out[37]:





((18164, 6895), (18164,), (12110, 6895), (12110,))



In [38]:

    
y_pred = model.predict(X_test)



In [39]:

    
print classification_report(y_pred, y_test)









    



             precision    recall  f1-score   support

          F       0.90      0.91      0.90      7330
          M       0.86      0.84      0.85      4780

avg / total       0.88      0.88      0.88     12110



In [42]:

    
pd.concat((df_names.iloc[test_ids][["Name", "BestGender"]], pd.Series(y_pred, name="Predicted", index=test_ids)), axis=1).head()









    Out[42]:






  
    
      Gender
      Name
      BestGender
      Predicted
    
  
  
    
      24482
      Santanna
      F
      F
    
    
      19861
      Merisa
      F
      F
    
    
      23753
      Roneisha
      F
      F
    
    
      25650
      Sheyla
      F
      F
    
    
      3830
      Boston
      M
      M



In [43]:

    
df_model_coeffs = pd.DataFrame({"Feature": cvec.get_feature_names(), "Coeff": model.coef_[0]})



In [44]:

    
df_model_coeffs["Feature_len"] = df_model_coeffs["Feature"].apply(lambda x: len(x))
df_model_coeffs.sort("Coeff", ascending=False).head(20)









    Out[44]:






  
    
      
      Coeff
      Feature
      Feature_len
    
  
  
    
      1258
      4.605880
      Shlo
      4
    
    
      6103
      3.106888
      siah
      4
    
    
      1509
      3.013812
      ^Abd
      4
    
    
      5109
      2.542565
      ndel
      4
    
    
      5638
      2.505959
      rag
      3
    
    
      6844
      2.485604
      zai
      3
    
    
      6404
      2.466336
      uda
      3
    
    
      4613
      2.463779
      kw
      2
    
    
      938
      2.416480
      Mark
      4
    
    
      6592
      2.324890
      vil
      3
    
    
      2430
      2.314680
      ^Zac
      4
    
    
      5561
      2.296611
      ovan
      4
    
    
      4003
      2.264858
      hay
      3
    
    
      5543
      2.264134
      ota
      3
    
    
      6773
      2.257093
      yne$
      4
    
    
      975
      2.242189
      Merl
      4
    
    
      6275
      2.241032
      tho
      3
    
    
      934
      2.231238
      Marc
      4
    
    
      5486
      2.209771
      ord
      3
    
    
      2797
      2.203274
      anni
      4



In [45]:

    
df_names[df_names.Name_proc.str.contains("ndel")]









    Out[45]:






  
    
      Gender
      Name
      F
      M
      Total
      MaxCount
      BestGender
      Name_proc
    
  
  
    
      1735
      Andelyn
      10
      0
      10
      10
      F
      ^Andelyn$
    
    
      3761
      Blondell
      631
      0
      631
      631
      F
      ^Blondell$
    
    
      3896
      Brandel
      6
      0
      6
      6
      F
      ^Brandel$
    
    
      4003
      Breindel
      69
      0
      69
      69
      F
      ^Breindel$
    
    
      4615
      Candelaria
      1328
      0
      1328
      1328
      F
      ^Candelaria$
    
    
      4616
      Candelario
      0
      1565
      1565
      1565
      M
      ^Candelario$
    
    
      10368
      Glendel
      0
      20
      20
      20
      M
      ^Glendel$
    
    
      10369
      Glendell
      0
      24
      24
      24
      M
      ^Glendell$
    
    
      11084
      Hendel
      29
      0
      29
      29
      F
      ^Hendel$
    
    
      11243
      Hindel
      5
      0
      5
      5
      F
      ^Hindel$
    
    
      12551
      Jandel
      0
      51
      51
      51
      M
      ^Jandel$
    
    
      15409
      Kendel
      15
      20
      35
      20
      M
      ^Kendel$
    
    
      15410
      Kendell
      235
      1683
      1918
      1683
      M
      ^Kendell$
    
    
      15924
      Kindell
      5
      0
      5
      5
      F
      ^Kindell$
    
    
      16403
      Kyndel
      40
      0
      40
      40
      F
      ^Kyndel$
    
    
      16404
      Kyndell
      15
      0
      15
      15
      F
      ^Kyndell$
    
    
      17597
      Lindell
      0
      906
      906
      906
      M
      ^Lindell$
    
    
      17791
      Londell
      0
      11
      11
      11
      M
      ^Londell$
    
    
      18150
      Lyndel
      0
      5
      5
      5
      M
      ^Lyndel$
    
    
      18151
      Lyndell
      27
      245
      272
      245
      M
      ^Lyndell$
    
    
      18718
      Mandel
      0
      5
      5
      5
      M
      ^Mandel$
    
    
      18719
      Mandela
      0
      10
      10
      10
      M
      ^Mandela$
    
    
      19823
      Mendel
      0
      1383
      1383
      1383
      M
      ^Mendel$
    
    
      20156
      Mindel
      49
      0
      49
      49
      F
      ^Mindel$
    
    
      22622
      Quandell
      0
      5
      5
      5
      M
      ^Quandell$
    
    
      22684
      Quindell
      0
      5
      5
      5
      M
      ^Quindell$
    
    
      22962
      Randel
      0
      1139
      1139
      1139
      M
      ^Randel$
    
    
      22963
      Randell
      0
      6228
      6228
      6228
      M
      ^Randell$
    
    
      23310
      Rendell
      0
      11
      11
      11
      M
      ^Rendell$
    
    
      23746
      Rondel
      0
      5
      5
      5
      M
      ^Rondel$
    
    
      23747
      Rondell
      0
      649
      649
      649
      M
      ^Rondell$
    
    
      24915
      Shaindel
      483
      0
      483
      483
      F
      ^Shaindel$
    
    
      25073
      Shandel
      11
      0
      11
      11
      F
      ^Shandel$
    
    
      25074
      Shandell
      26
      0
      26
      26
      F
      ^Shandell$
    
    
      25393
      Shawndell
      7
      21
      28
      21
      M
      ^Shawndell$
    
    
      25472
      Sheindel
      31
      0
      31
      31
      F
      ^Sheindel$
    
    
      25767
      Shondell
      13
      18
      31
      18
      M
      ^Shondell$
    
    
      28818
      Vondell
      10
      0
      10
      10
      F
      ^Vondell$
    
    
      28948
      Wendel
      0
      143
      143
      143
      M
      ^Wendel$
    
    
      28949
      Wendelin
      5
      21
      26
      21
      M
      ^Wendelin$
    
    
      28950
      Wendell
      0
      48363
      48363
      48363
      M
      ^Wendell$
    
    
      28951
      Wendelyn
      5
      0
      5
      5
      F
      ^Wendelyn$
    
    
      29078
      Windell
      0
      1076
      1076
      1076
      M
      ^Windell$
    
    
      29149
      Wyndell
      0
      15
      15
      15
      M
      ^Wyndell$
    
    
      29320
      Yandel
      0
      2509
      2509
      2509
      M
      ^Yandel$
    
    
      29321
      Yandell
      0
      5
      5
      5
      M
      ^Yandell$



In [ ]:

    
df_sample = df_props_filtered[["Name", "Year","State","Name_proc", "F", "M", "MaxCount","Total","BestGender"]]

Train model on name character ngram + year feature

Add year feature



In [47]:

    
cvec = CountVectorizer(analyzer="char", ngram_range=(2, 4), binary=True, lowercase=False, min_df=10)
cvec.fit(df_props_filtered.Name_proc)
cvec.transform(df_props_filtered.Name_proc.head()).todense().shape









    Out[47]:





(5, 3098)



In [48]:

    
class ColumnFeatures(TransformerMixin):
    def __init__(self, colname, to_df=True):
        print "Initialized extractor for column %s" % colname
        self.colname = colname
        self.to_df = to_df
    def get_feature_names(self):
        return [self.colname]
    def transform(self, X, **transform_params):
        print "Extracting column [%s], to_df = %s" % (self.colname, self.to_df)
        if self.to_df:
            return pd.DataFrame(X[self.colname])
        return X[self.colname]
    def fit(self, X, y=None, **fit_params):
        return self


class IdentityTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        print "X processed by parent. Output shape: %s" % (X.shape, )
        return X
    def fit(self, X, y=None, **fit_params):
        return self
    
class DenseTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        print "New shape: ",  X.todense().shape
        return X.todense()
    def fit(self, X, y=None, **fit_params):
        return self
    

class MultiColumnExtractor(TransformerMixin):
    def __init__(self, colnames):
        print "Initialized extractor for column %s" % colnames
        self.colnames = colnames
    def get_feature_names(self):
        return self.colnames
    def transform(self, X, **transform_params):
        print "Extracting columns [%s]" % (self.colnames,)
        return pd.DataFrame(X[self.colnames])
    def fit(self, X, y=None, **fit_params):
        return self
        
pipeline = Pipeline([
        ("features", FeatureUnion([
                    ("year",  Pipeline([
                                ("year_val", ColumnFeatures(colname="Year")),
                                ("year_norm", StandardScaler())
                    ])),
                    ("state", Pipeline([
                               ("state_val", ColumnFeatures(colname="State")),
                               ("state_cat", LabelBinarizer())
                    ])),
                    ("names", Pipeline([
                               ("name_val", ColumnFeatures(colname="Name_proc", to_df=False)),
                               ("name_ngram", CountVectorizer(analyzer="char", ngram_range=(2, 4), binary=True, lowercase=False, min_df=10)),
                    ])),
                    
                ]))
    ])









    



Initialized extractor for column Year
Initialized extractor for column State
Initialized extractor for column Name_proc



In [49]:

    
pipeline.fit_transform(df_props_filtered).shape









    



Extracting column [Year], to_df = True
Extracting column [State], to_df = True
Extracting column [Name_proc], to_df = False






    



/usr/local/lib/python2.7/dist-packages/sklearn/utils/validation.py:498: UserWarning: StandardScaler assumes floating point values as input, got int64
  "got %s" % (estimator, X.dtype))






    Out[49]:





(59777, 3141)



In [50]:

    
df_props_filtered.values[:10]









    Out[50]:





array([['Aaliyah', 'CA', 2009, 556.0, 0.0, 556.0, 556.0, 'F', '^Aaliyah$'],
       ['Aaliyah', 'CA', 2010, 563.0, 0.0, 563.0, 563.0, 'F', '^Aaliyah$'],
       ['Aaliyah', 'CA', 2011, 678.0, 0.0, 678.0, 678.0, 'F', '^Aaliyah$'],
       ['Aaliyah', 'CA', 2012, 737.0, 0.0, 737.0, 737.0, 'F', '^Aaliyah$'],
       ['Aaliyah', 'CA', 2013, 706.0, 0.0, 706.0, 706.0, 'F', '^Aaliyah$'],
       ['Aaliyah', 'CA', 2014, 695.0, 0.0, 695.0, 695.0, 'F', '^Aaliyah$'],
       ['Aaliyah', 'TX', 2010, 555.0, 0.0, 555.0, 555.0, 'F', '^Aaliyah$'],
       ['Aaliyah', 'TX', 2011, 549.0, 0.0, 549.0, 549.0, 'F', '^Aaliyah$'],
       ['Aaliyah', 'TX', 2012, 682.0, 0.0, 682.0, 682.0, 'F', '^Aaliyah$'],
       ['Aaliyah', 'TX', 2013, 603.0, 0.0, 603.0, 603.0, 'F', '^Aaliyah$']], dtype=object)



In [51]:

    
model = LogisticRegression(penalty="l1")

np.random.seed(1337)
shuffle_ids = np.random.permutation(df_names.shape[0])
split_point = int(len(shuffle_ids)*0.6)
train_ids, test_ids = shuffle_ids[:split_point], shuffle_ids[split_point:]
train_ids.shape, test_ids.shape

X_train, y_train = pipeline.transform(df_props_filtered.iloc[train_ids]), df_props_filtered.iloc[train_ids].BestGender
X_test, y_test = pipeline.transform(df_props_filtered.iloc[test_ids]), df_props_filtered.iloc[test_ids].BestGender









    



Extracting column [Year], to_df = True
Extracting column [State], to_df = True
Extracting column [Name_proc], to_df = False
Extracting column [Year], to_df = True
Extracting column [State], to_df = True
Extracting column [Name_proc], to_df = False



In [52]:

    
model.fit(X_train, y_train)









    Out[52]:





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)



In [53]:

    
y_pred = model.predict(X_test)
print classification_report(y_pred, y_test)









    



             precision    recall  f1-score   support

          F       1.00      1.00      1.00      5723
          M       1.00      1.00      1.00      6387

avg / total       1.00      1.00      1.00     12110



In [54]:

    
f = pipeline.named_steps["features"]
df_model_coeffs = pd.DataFrame({"Feature": ["Year"] + f.transformer_list[1][1].named_steps["state_cat"].classes_.tolist() + f.transformer_list[2][1].named_steps["name_ngram"].get_feature_names(), "Coeff": model.coef_[0]})



In [55]:

    
df_model_coeffs.sort("Coeff", ascending=True).head(20)



In [61]:

    
pipeline = Pipeline([
        ("features", FeatureUnion([
                    ("year_latlong",  Pipeline([
                                ("year_val", MultiColumnExtractor(colnames=["Year", "Latitude", "Longitude"])),
                                ("year_norm", StandardScaler())
                    ])),
                    ("names", Pipeline([
                               ("name_val", ColumnFeatures(colname="Name_proc", to_df=False)),
                               ("name_ngram", CountVectorizer(analyzer="char", ngram_range=(2, 4), binary=True, lowercase=False, min_df=10)),
                    ])),
                    
                ]))
    ])









    



Initialized extractor for column ['Year', 'Latitude', 'Longitude']
Initialized extractor for column Name_proc



In [62]:

    
model = LogisticRegression(penalty="l1")

np.random.seed(1337)
shuffle_ids = np.random.permutation(df_names.shape[0])
split_point = int(len(shuffle_ids)*0.6)
train_ids, test_ids = shuffle_ids[:split_point], shuffle_ids[split_point:]
train_ids.shape, test_ids.shape

X_train, y_train = pipeline.fit_transform(df_names_latlong.iloc[train_ids]), df_names_latlong.iloc[train_ids].BestGender
X_test, y_test = pipeline.transform(df_names_latlong.iloc[test_ids]), df_names_latlong.iloc[test_ids].BestGender









    



Extracting columns [['Year', 'Latitude', 'Longitude']]
Extracting column [Name_proc], to_df = False
Extracting columns [['Year', 'Latitude', 'Longitude']]
Extracting column [Name_proc], to_df = False



In [63]:

    
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print classification_report(y_pred, y_test)









    



             precision    recall  f1-score   support

          F       1.00      1.00      1.00      5723
          M       1.00      1.00      1.00      6387

avg / total       1.00      1.00      1.00     12110



In [64]:

    
df_props.head()



In [65]:

    
f = pipeline.named_steps["features"]
df_model_coeffs = pd.DataFrame({"Feature": f.transformer_list[0][1].named_steps["year_val"].get_feature_names()\
                                + f.transformer_list[1][1].named_steps["name_ngram"].get_feature_names(),\
                                "Coeff": model.coef_[0]})



In [66]:

    
f = pipeline.named_steps["features"]



In [67]:

    
f.transformer_list[0][1].named_steps["year_val"].get_feature_names()









    Out[67]:





['Year', 'Latitude', 'Longitude']



In [68]:

    
df_model_coeffs.head()



In [69]:

    
df_t = df_props.groupby("Name")[["F", "M"]].max()
df_t.head()



In [70]:

    
df_t["Prop"] = np.abs(df_t["F"] - df_t["M"]) / (df_t["F"] + df_t["M"])
df_t[(df_t["Prop"] < 0.1) & ((df_t["F"] + df_t["M"]) > 100)].sort("Prop")



In [71]:

    
df_t = df_props.groupby(["Name", "Year"])[["F", "M"]].sum().reset_index()
df_t["BestGender"] = df_t[["F", "M"]].idxmax(axis=1)



In [72]:

    
df_t.head()



In [80]:

    
clust_model = AgglomerativeClustering(n_clusters=10)



In [81]:

    
cids = clust_model.fit_predict(cvec.transform(df_names.Name_proc.head(100)).todense())



In [82]:

    
cids









    Out[82]:





array([0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 6, 6,
       6, 9, 6, 6, 9, 5, 9, 5, 6, 6, 8, 8, 8, 8, 5, 0, 0, 0, 5, 0, 1, 1, 1,
       1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 2, 1, 1, 1, 4, 4, 4, 4, 4, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 7, 0, 0, 0, 3,
       3, 3, 3, 3, 3, 3, 3, 3])



In [90]:

    
pd.concat((df_names.Name.head(100), pd.Series(cids, name="ClusterIDs", index=df_names.Name_proc.head(100).index)), axis=1).head()



In [ ]:

	Coeff	Feature
1253	-12.382296	a$
2387	-6.719285	ne
2984	-6.560792	ud
2031	-6.517110	ie$
1958	-6.246937	hl
2955	-6.048778	tt
171	-5.078938	Caro
82	-4.331077	Am
3130	-4.048076	ys
448	-3.754964	Joan
1929	-3.567873	he
1258	-3.542252	abe
2295	-3.527388	ly
427	-3.521152	Jea
1038	-3.485361	^Jo$
2330	-3.472253	min$
2258	-3.391106	lle
1714	-3.378617	el
1912	-3.346282	h$
845	-3.209122	^Ann

Gender	F	M	Prop
Name
Unnamed	79	79	0.000000
Notnamed	108	109	0.004608
Justice	80	79	0.006289
Charley	55	54	0.009174
Infant	544	560	0.014493
Emory	65	67	0.015152
Johnnie	268	257	0.020952
Stevie	52	55	0.028037
Tommie	117	124	0.029046
Jaylin	64	59	0.040650
Unknown	136	149	0.045614
Armani	59	65	0.048387
Darian	83	69	0.092105
Jaime	753	621	0.096070
Frankie	117	96	0.098592

	0	1	2	3	4
0	AK	F	1910	Mary	14
1	AK	F	1910	Annie	12
2	AK	F	1910	Anna	10
3	AK	F	1910	Margaret	8
4	AK	F	1910	Helen	7

Gender	F			M
	Count	Name	Year	Count	Name	Year
State
AK	164	Jessica	1985	207	Michael	1982
AL	2694	Mary	1925	3038	James	1947
AR	1424	Linda	1948	1620	James	1947
AZ	875	Jessica	1987	1045	Michael	1984
CA	6949	Jessica	1991	8268	Michael	1956
CO	1036	Linda	1948	973	Michael	1970
CT	1155	Linda	1947	1622	Robert	1947
DC	615	Linda	1949	884	John	1947
DE	207	Jennifer	1974	311	Michael	1958
FL	2739	Jessica	1987	3491	Michael	1990
GA	2614	Mary	1922	3066	James	1947
HI	243	Jennifer	1981	325	Michael	1957
IA	2365	Linda	1947	1667	Robert	1924
ID	530	Linda	1947	348	Michael	1954
IL	5285	Linda	1947	6220	Michael	1957
IN	3012	Linda	1947	2781	Michael	1954
KS	1387	Linda	1947	1184	Michael	1954
KY	2537	Mary	1924	2553	James	1947
LA	1648	Linda	1947	1667	Michael	1958
MA	3035	Mary	1921	3895	Robert	1947
MD	1505	Linda	1947	1789	Michael	1957
ME	778	Linda	1947	649	Robert	1947
MI	4538	Linda	1947	4924	Michael	1957
MN	2144	Linda	1948	2043	David	1955
MO	2797	Linda	1947	2430	James	1947
MS	1840	Mary	1927	2278	James	1947
MT	472	Linda	1948	403	John	1947
NC	3126	Mary	1924	3884	James	1947
ND	496	Linda	1947	344	David	1952
NE	1055	Linda	1947	937	Robert	1924
NH	437	Linda	1947	447	Robert	1947
NJ	2398	Jennifer	1972	3699	John	1964
NM	589	Mary	1947	543	David	1961
NV	268	Jessica	1988	341	Michael	1990
NY	7542	Linda	1947	10023	Robert	1947
OH	5881	Linda	1947	5492	David	1955
OK	2064	Linda	1947	1273	James	1925
OR	1121	Linda	1947	984	Michael	1953
PA	8184	Mary	1918	8048	John	1947
RI	485	Mary	1921	682	Robert	1947
SC	1742	Mary	1924	2314	James	1947
SD	507	Linda	1948	413	Robert	1925
TN	2785	Linda	1947	3279	James	1947
TX	5060	Linda	1947	4983	Christopher	1985
UT	702	Jennifer	1972	657	Michael	1980
VA	2307	Linda	1948	2676	James	1947
VT	325	Linda	1947	262	David	1955
WA	1751	Linda	1948	1620	Michael	1954
WI	2399	Mary	1954	2400	David	1955
WV	1795	Linda	1947	1682	James	1947
WY	229	Linda	1948	194	David	1955

Gender	Name	State	Year	M
0	Aaban	NY	2013	6
1	Aaban	NY	2014	6
2	Aadan	CA	2008	7
3	Aadan	CA	2009	6
4	Aadan	CA	2014	5

	Year	F	M	Total	MaxCount
count	5448518.000000	5448518.000000	5448518.000000	5448518.000000	5448518.000000
mean	1972.316033	26.387006	28.468888	54.855894	54.256168
std	29.617741	109.590313	152.552096	185.000905	183.863425
min	1910.000000	0.000000	0.000000	5.000000	5.000000
25%	1949.000000	0.000000	0.000000	7.000000	7.000000
50%	1977.000000	6.000000	0.000000	13.000000	13.000000
75%	1999.000000	15.000000	12.000000	35.000000	35.000000
max	2014.000000	8184.000000	10023.000000	10046.000000	10023.000000

Gender	Name	M	Total	MaxCount	BestGender
0	Aaban	12	12	12	M
1	Aadan	23	23	23	M
2	Aadarsh	5	5	5	M
3	Aaden	3426	3426	3426	M
4	Aadhav	6	6	6	M

Gender	Name	BestGender	Predicted
24482	Santanna	F	F
19861	Merisa	F	F
23753	Roneisha	F	F
25650	Sheyla	F	F
3830	Boston	M	M

	Coeff	Feature	Feature_len
1258	4.605880	Shlo	4
6103	3.106888	siah	4
1509	3.013812	^Abd	4
5109	2.542565	ndel	4
5638	2.505959	rag	3
6844	2.485604	zai	3
6404	2.466336	uda	3
4613	2.463779	kw	2
938	2.416480	Mark	4
6592	2.324890	vil	3
2430	2.314680	^Zac	4
5561	2.296611	ovan	4
4003	2.264858	hay	3
5543	2.264134	ota	3
6773	2.257093	yne$	4
975	2.242189	Merl	4
6275	2.241032	tho	3
934	2.231238	Marc	4
5486	2.209771	ord	3
2797	2.203274	anni	4

Gender	Name	F	M	Total	MaxCount	BestGender	Name_proc
1735	Andelyn	10	0	10	10	F	^Andelyn$
3761	Blondell	631	0	631	631	F	^Blondell$
3896	Brandel	6	0	6	6	F	^Brandel$
4003	Breindel	69	0	69	69	F	^Breindel$
4615	Candelaria	1328	0	1328	1328	F	^Candelaria$
4616	Candelario	0	1565	1565	1565	M	^Candelario$
10368	Glendel	0	20	20	20	M	^Glendel$
10369	Glendell	0	24	24	24	M	^Glendell$
11084	Hendel	29	0	29	29	F	^Hendel$
11243	Hindel	5	0	5	5	F	^Hindel$
12551	Jandel	0	51	51	51	M	^Jandel$
15409	Kendel	15	20	35	20	M	^Kendel$
15410	Kendell	235	1683	1918	1683	M	^Kendell$
15924	Kindell	5	0	5	5	F	^Kindell$
16403	Kyndel	40	0	40	40	F	^Kyndel$
16404	Kyndell	15	0	15	15	F	^Kyndell$
17597	Lindell	0	906	906	906	M	^Lindell$
17791	Londell	0	11	11	11	M	^Londell$
18150	Lyndel	0	5	5	5	M	^Lyndel$
18151	Lyndell	27	245	272	245	M	^Lyndell$
18718	Mandel	0	5	5	5	M	^Mandel$
18719	Mandela	0	10	10	10	M	^Mandela$
19823	Mendel	0	1383	1383	1383	M	^Mendel$
20156	Mindel	49	0	49	49	F	^Mindel$
22622	Quandell	0	5	5	5	M	^Quandell$
22684	Quindell	0	5	5	5	M	^Quindell$
22962	Randel	0	1139	1139	1139	M	^Randel$
22963	Randell	0	6228	6228	6228	M	^Randell$
23310	Rendell	0	11	11	11	M	^Rendell$
23746	Rondel	0	5	5	5	M	^Rondel$
23747	Rondell	0	649	649	649	M	^Rondell$
24915	Shaindel	483	0	483	483	F	^Shaindel$
25073	Shandel	11	0	11	11	F	^Shandel$
25074	Shandell	26	0	26	26	F	^Shandell$
25393	Shawndell	7	21	28	21	M	^Shawndell$
25472	Sheindel	31	0	31	31	F	^Sheindel$
25767	Shondell	13	18	31	18	M	^Shondell$
28818	Vondell	10	0	10	10	F	^Vondell$
28948	Wendel	0	143	143	143	M	^Wendel$
28949	Wendelin	5	21	26	21	M	^Wendelin$
28950	Wendell	0	48363	48363	48363	M	^Wendell$
28951	Wendelyn	5	0	5	5	F	^Wendelyn$
29078	Windell	0	1076	1076	1076	M	^Windell$
29149	Wyndell	0	15	15	15	M	^Wyndell$
29320	Yandel	0	2509	2509	2509	M	^Yandel$
29321	Yandell	0	5	5	5	M	^Yandell$

	Coeff	Feature
0	0.706100	Year
1	-0.073698	Latitude
2	-0.102547	Longitude
3	0.000000	Aa
4	0.000000	Aar